#!/bin/bash
#SBATCH --ntasks-per-node=40
#SBATCH --nodes=1
#SBATCH --time=5:00:00

# Setup: move to the submission directory, load the alignment toolchain,
# define the reference inputs and read the per-sample settings.

# Every path used below is relative, so stop if the submit directory cannot
# be entered instead of running the pipeline somewhere else.
cd "$SLURM_SUBMIT_DIR" || { echo "ERROR: cannot cd to '$SLURM_SUBMIT_DIR'" >&2; exit 1; }


module load biobuilds/2017.05

REF=../HUMAN_REFS/HG38/hs38.fa
DBSITES=../HUMAN_REFS/HG38/common_dbsnp_151.hg38.vcf

# The variables file contains $sampleID and the location of the fastq files.
# It is sourced by explicit path so bash does not look it up in $PATH first;
# this is the same file in the working directory that is grepped later on.
. ./variables || { echo "ERROR: cannot source ./variables" >&2; exit 1; }

# All output (and later deleted) file names are built from $sampleID, so
# refuse to continue when the variables file did not define it.
: "${sampleID:?sampleID must be set in the variables file}"


##########################
# Concatenate multilane FQ files if required
##########################
# Take the text after the first '=' of every line in the variables file that
# mentions pair1 / pair2. The unquoted $(...) is intentional: its output is
# word-split into one array element per fastq path.
pair1_fqs=($(grep 'pair1' variables | cut -d '=' -f 2))
pair2_fqs=($(grep 'pair2' variables | cut -d '=' -f 2))

# Without at least one file per mate there is nothing to align.
if (( ${#pair1_fqs[@]} == 0 || ${#pair2_fqs[@]} == 0 )); then
    echo "ERROR: no pair1/pair2 fastq entries found in the variables file" >&2
    exit 1
fi

# Start from empty outputs: the loops below append, so files left over from
# an earlier (aborted) run would otherwise end up with duplicated reads.
: > "$sampleID"_cat1.fq
: > "$sampleID"_cat2.fq

# ${f//\"/} drops the literal double quotes carried over from the variables file.
for f in "${pair1_fqs[@]}"; do
    zcat "${f//\"/}" >> "$sampleID"_cat1.fq \
        || { echo "ERROR: failed to decompress ${f//\"/}" >&2; exit 1; }
done

for f in "${pair2_fqs[@]}"; do
    zcat "${f//\"/}" >> "$sampleID"_cat2.fq \
        || { echo "ERROR: failed to decompress ${f//\"/}" >&2; exit 1; }
done


###########################################
# Align the fastq files to the reference genome using bwa-mem
# -t = Threads -M = flag shorter split hits as secondary
# -R = Read group header line
# (-O gap open / -E gap extension penalties are not passed here.)
###########################################

# The read group string is double-quoted so $sampleID cannot be word-split;
# the \t sequences stay literal inside double quotes and are passed to bwa
# unchanged. Abort on failure so a truncated SAM is never processed further
# and the concatenated fastq files are kept for a re-run.
bwa mem \
	-t 40 \
	-M \
	-R "@RG\tID:${sampleID}_lane1\tSM:${sampleID}\tPL:ILLUMINA\tLB:Library" \
	"$REF" \
	"$sampleID"_cat1.fq "$sampleID"_cat2.fq \
	> "$sampleID"_aligned.sam \
	|| { echo "ERROR: bwa mem failed for sample $sampleID" >&2; exit 1; }

## Remove the concatenated FastQ files now that the alignment has succeeded.
rm "$sampleID"_cat1.fq "$sampleID"_cat2.fq


######################
# convert SAM to BAM
######################

# Stop if the conversion fails: the SAM is the only copy of the alignment at
# this point and must not be deleted unless the BAM was written.
samtools view -bS -@ 40 -o "$sampleID"_aligned.bam "$sampleID"_aligned.sam \
	|| { echo "ERROR: samtools view failed for sample $sampleID" >&2; exit 1; }

rm "$sampleID"_aligned.sam

module unload biobuilds/2017.05

######################
# Picard tools
# sort -> mark duplicates -> sort -> index -> fix mate information.
# Each step's input BAM is deleted only after that step has succeeded.
######################

module load picard

# Report which Picard tool failed and abort, so the input of the failed step
# (and every intermediate still on disk) is left in place for inspection.
#   $1 - name of the Picard tool that failed
picard_fail() {
	echo "ERROR: picard $1 failed for sample $sampleID" >&2
	exit 1
}

# sort bam file
picard SortSam \
	INPUT="$sampleID"_aligned.bam \
	OUTPUT="$sampleID"_aligned_sorted.bam \
	SORT_ORDER=coordinate \
	TMP_DIR=tmp1 \
	VALIDATION_STRINGENCY=SILENT \
	MAX_RECORDS_IN_RAM=2500000 \
	|| picard_fail SortSam

rm "$sampleID"_aligned.bam

# mark duplicates
# NOTE(review): no REMOVE_DUPLICATES option is passed, so duplicates are
# presumably only flagged, not dropped, despite the "DelDup" file name used
# below -- confirm against the Picard defaults.
picard MarkDuplicates \
	INPUT="$sampleID"_aligned_sorted.bam \
	METRICS_FILE="$sampleID"_dup_metrics \
	OUTPUT="$sampleID"_marked_dups_sorted.bam \
	TMP_DIR=tmp1 \
	VALIDATION_STRINGENCY=SILENT \
	MAX_RECORDS_IN_RAM=2500000 \
	|| picard_fail MarkDuplicates

rm "$sampleID"_aligned_sorted.bam
rm -fr tmp1

# Sort BAM file
picard SortSam \
	INPUT="$sampleID"_marked_dups_sorted.bam \
	OUTPUT="$sampleID".DelDup.bam \
	SORT_ORDER=coordinate \
	TMP_DIR=tmp2 \
	VALIDATION_STRINGENCY=SILENT \
	MAX_RECORDS_IN_RAM=2500000 \
	|| picard_fail SortSam

rm "$sampleID"_marked_dups_sorted.bam
rm -fr tmp2

# Index BAM file (the resulting .DelDup.bai is removed again further down)
picard BuildBamIndex \
	INPUT="$sampleID".DelDup.bam \
	TMP_DIR=tmp3 \
	VALIDATION_STRINGENCY=SILENT \
	|| picard_fail BuildBamIndex

# Fix mate pair information with Picard; CREATE_INDEX=true also writes the
# index for the output BAM.
picard FixMateInformation \
	INPUT="$sampleID".DelDup.bam \
	OUTPUT="$sampleID".GATK.fixedmateinfo.bam \
	SORT_ORDER=coordinate \
	TMP_DIR=tmp3 \
	VALIDATION_STRINGENCY=SILENT \
	MAX_RECORDS_IN_RAM=500000 \
	CREATE_INDEX=true \
	|| picard_fail FixMateInformation



rm "$sampleID".DelDup.bam "$sampleID".DelDup.bai
rm -fr tmp3

module unload picard
 
##########################################################################################
# GATK BQSR; NB: Indel realignment not required when using HaplotypeCaller downstream
##########################################################################################

module load biobuilds/2017.05
module load GATK/4.0.7

# Single definition of the GATK jar used by both BQSR steps.
GATK_JAR=/local/software/GATK/gatk-4.0.7.0/gatk-package-4.0.7.0-local.jar

# Recalibrating base quality (longer: more than 60 minutes)
# if it fails, might be an old Illumina sample-> add -fixMisencodedQuals
java -Xmx150G -jar "$GATK_JAR" BaseRecalibrator \
	-I "$sampleID".GATK.fixedmateinfo.bam \
	-R "$REF" \
	--known-sites "$DBSITES" \
	-O "$sampleID".recal_data.table \
	|| { echo "ERROR: GATK BaseRecalibrator failed for sample $sampleID" >&2; exit 1; }

# Shorter: less than 15 minutes
java -Xmx150G -jar "$GATK_JAR" ApplyBQSR \
	-R "$REF" \
	-I "$sampleID".GATK.fixedmateinfo.bam \
	--bqsr-recal-file "$sampleID".recal_data.table \
	-O "$sampleID".GATK.recal.bam \
	|| { echo "ERROR: GATK ApplyBQSR failed for sample $sampleID" >&2; exit 1; }


##############################
# Remove the penultimate BAM and its index; only reached when both BQSR
# steps above succeeded, so the recalibrated BAM exists before its input goes.
##############################

rm "$sampleID".GATK.fixedmateinfo.bam "$sampleID".GATK.fixedmateinfo.bai
